建構decision tree 模型

In [61]:
import pandas as pd 
df = pd.read_csv('C:\\Users\\User\\Desktop\\result1.csv')
In [62]:
df.head(5)
Out[62]:
Year Tm W L R/G BA OBP SLG OPS ADV
0 1998 ANA 85 77 4.86 0.272 0.335 0.415 0.751 0
1 1998 ARI 65 97 4.10 0.246 0.314 0.393 0.707 0
2 1998 ATL 106 56 5.10 0.272 0.342 0.453 0.795 1
3 1998 BAL 79 83 5.04 0.273 0.347 0.447 0.794 0
4 1998 BOS 92 70 5.41 0.280 0.348 0.463 0.810 1
In [63]:
X = df[['Tm','W','L', 'R/G', 'BA', 'OBP', 'SLG', 'OPS']]
y = df['ADV']
In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
In [65]:
X_train.head(5)
Out[65]:
Tm W L R/G BA OBP SLG OPS
191 FLA 83 79 4.43 0.264 0.329 0.406 0.736
607 CLE 91 71 5.05 0.259 0.332 0.434 0.766
619 OAK 97 65 5.02 0.252 0.325 0.439 0.764
611 KCR 58 104 3.94 0.245 0.305 0.392 0.697
7 CIN 77 85 4.63 0.262 0.337 0.402 0.739
In [66]:
X_train_NoName = X_train[['W','L', 'R/G', 'BA', 'OBP', 'SLG', 'OPS']]
In [67]:
X_train_NoName.head(5)
Out[67]:
W L R/G BA OBP SLG OPS
191 83 79 4.43 0.264 0.329 0.406 0.736
607 91 71 5.05 0.259 0.332 0.434 0.766
619 97 65 5.02 0.252 0.325 0.439 0.764
611 58 104 3.94 0.245 0.305 0.392 0.697
7 77 85 4.63 0.262 0.337 0.402 0.739
In [68]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth = 5, n_estimators = 10)
rfc.fit(X_train_NoName, y_train)
Out[68]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
In [69]:
X_test_NoName = X_test[['W','L', 'R/G', 'BA', 'OBP', 'SLG', 'OPS']]
In [70]:
X_test_NoName.head(5)
Out[70]:
W L R/G BA OBP SLG OPS
338 92 70 4.96 0.261 0.343 0.441 0.784
283 94 68 5.07 0.284 0.345 0.417 0.762
67 85 77 5.06 0.274 0.343 0.447 0.790
502 77 85 3.30 0.226 0.292 0.342 0.634
498 79 83 3.88 0.239 0.308 0.364 0.673
In [71]:
print('The accuracy of Random Forest Classifier on testing set:', rfc.score(X_test_NoName, y_test))
The accuracy of Random Forest Classifier on testing set: 0.8734177215189873
In [72]:
testResults = rfc.predict(X_test_NoName)
In [73]:
X_test['預測'] = testResults
X_test['真實'] = y_test
C:\Users\user\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
C:\Users\user\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [74]:
X_test['不同'] = X_test['預測'] - X_test['真實']
C:\Users\user\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [75]:
X_test.loc[X_test['不同'] != 0]
Out[75]:
Tm W L R/G BA OBP SLG OPS 預測 真實 不同
514 CHW 76 86 3.84 0.250 0.306 0.380 0.686 0 1 -1
228 NYM 83 79 4.46 0.258 0.322 0.416 0.738 0 1 -1
618 NYM 77 85 4.17 0.234 0.312 0.389 0.701 0 1 -1
382 SDP 90 72 4.10 0.246 0.317 0.371 0.689 1 0 1
377 NYY 95 67 5.30 0.267 0.350 0.436 0.786 1 0 1
604 CHW 62 100 4.05 0.241 0.302 0.401 0.703 0 1 -1
393 BOS 90 72 5.40 0.280 0.349 0.461 0.810 1 0 1
347 NYY 103 59 5.65 0.283 0.362 0.478 0.839 1 0 1
6 CHC 90 73 5.10 0.264 0.337 0.433 0.771 1 0 1
574 CHW 67 95 4.36 0.256 0.314 0.417 0.731 0 1 -1
578 COL 87 75 5.09 0.273 0.338 0.444 0.781 0 1 -1
586 MIN 85 77 5.03 0.260 0.334 0.434 0.768 0 1 -1
623 SEA 89 73 4.18 0.254 0.314 0.408 0.722 1 0 1
527 NYY 87 75 4.72 0.251 0.323 0.421 0.744 0 1 -1
199 NYM 71 91 4.22 0.249 0.317 0.409 0.726 0 1 -1
626 TBR 90 72 4.42 0.258 0.333 0.406 0.740 1 0 1
198 NYY 101 61 5.54 0.268 0.353 0.458 0.811 1 0 1
254 LAD 88 74 5.06 0.276 0.348 0.432 0.781 0 1 -1
124 BOS 93 69 5.30 0.277 0.345 0.444 0.789 1 0 1
78 NYY 87 74 5.41 0.277 0.354 0.450 0.804 0 1 -1
In [76]:
X_test.loc[X_test['不同'] == 0]
Out[76]:
Tm W L R/G BA OBP SLG OPS 預測 真實 不同
338 COL 92 70 4.96 0.261 0.343 0.441 0.784 1 1 0
283 LAA 94 68 5.07 0.284 0.345 0.417 0.762 1 1 0
67 CIN 85 77 5.06 0.274 0.343 0.447 0.790 0 0 0
502 SDP 77 85 3.30 0.226 0.292 0.342 0.634 0 0 0
498 NYM 79 83 3.88 0.239 0.308 0.364 0.673 0 0 0
... ... ... ... ... ... ... ... ... ... ... ...
373 LAA 80 82 4.20 0.248 0.311 0.390 0.702 0 0 0
250 FLA 78 84 4.68 0.264 0.331 0.435 0.766 0 0 0
111 PHI 86 76 4.60 0.260 0.329 0.414 0.743 0 0 0
376 MIN 94 68 4.82 0.273 0.341 0.422 0.762 1 1 0
565 STL 86 76 4.81 0.255 0.325 0.442 0.767 0 0 0

138 rows × 11 columns

In [77]:
estimator = clf.estimators_[1]
from sklearn.tree import export_graphviz
export_graphviz(estimator,out_file='baseball.dot',                             
                              feature_names = X_train_NoName.columns,
                              class_names = [str(i) for i in y_test.unique()],
                              filled=True, 
                              rounded=True,  
                              special_characters=True)
In [78]:
from subprocess import call
call(['dot', '-Tpng', 'baseball.dot', '-o', 'baseball.png', '-Gdpi=600'])
Out[78]:
0
In [79]:
from IPython.display import Image
Image(filename = 'baseball.png')
Out[79]:
In [ ]: